In [1]:
#LIBRARIES
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
import folium
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [2]:
import os # to call files or use files and datasets from the local drive
In [3]:
os.listdir(r"N:\datasets\Datasets") #this list calls all the files from the os
Out[3]:
['.ipynb_checkpoints', 'other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv', 'Untitled1.ipynb']
here R or r is used to read all the files even if there is error is in backslash or frontslash¶
In [5]:
df = pd.read_csv(r"N:\datasets\Datasets/uber-raw-data-janjune-15_sample.csv")
In [6]:
df #print the data frame
Out[6]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 |
| ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 |
100000 rows × 4 columns
In [7]:
df.shape #printing the shape to check the data frame
Out[7]:
(100000, 4)
Data preprocessing ot data cleaning¶
In [9]:
df.duplicated
Out[9]:
<bound method DataFrame.duplicated of Dispatching_base_num Pickup_date Affiliated_base_num \
0 B02617 2015-05-02 21:43:00 B02764
1 B02682 2015-01-20 19:52:59 B02682
2 B02617 2015-03-19 20:26:00 B02617
3 B02764 2015-04-10 17:38:00 B02764
4 B02764 2015-03-23 07:03:00 B00111
... ... ... ...
99995 B02764 2015-04-13 16:12:00 B02764
99996 B02764 2015-03-06 21:32:00 B02764
99997 B02598 2015-03-19 19:56:00 B02598
99998 B02682 2015-05-02 16:02:00 B02682
99999 B02764 2015-06-24 16:04:00 B02764
locationID
0 237
1 231
2 161
3 107
4 140
... ...
99995 234
99996 24
99997 17
99998 68
99999 125
[100000 rows x 4 columns]>
df.duplicated() is typically used when you want to directly access the boolean Series indicating duplicates.¶
In [11]:
df.duplicated() # when you add paranthesis it consideres itself into two dimensional true or flase types
Out[11]:
0 False
1 False
2 False
3 False
4 False
...
99995 False
99996 False
99997 False
99998 False
99999 False
Length: 100000, dtype: bool
df.duplicated().sum() is used to count the total number of duplicate rows in the DataFrame, which can be useful for reporting or further processing¶
In [13]:
df.duplicated().sum()
Out[13]:
54
drop_duplicates,To drop all the duplicates and here inplace refers as to whatever places it is false it make true and rearrages the dataframe¶
In [15]:
df.drop_duplicates(inplace=True)
In [16]:
df1 = df.copy()
In [17]:
df1.drop_duplicates(inplace = True)
In [18]:
df1.shape # recheck again if the duplicates are removed and shape is
Out[18]:
(99946, 4)
In [19]:
df1
Out[19]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 |
| ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 |
99946 rows × 4 columns
Here im checking what kind of data is availabe in each type of columns¶
In [21]:
df1.dtypes
Out[21]:
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
Here im checking if isnull is making meaning if there are any null values¶
In [23]:
df1.isnull().sum()
Out[23]:
Dispatching_base_num 0 Pickup_date 0 Affiliated_base_num 1116 locationID 0 dtype: int64
In [24]:
df1.isnull()
Out[24]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | False | False | False | False |
| 1 | False | False | False | False |
| 2 | False | False | False | False |
| 3 | False | False | False | False |
| 4 | False | False | False | False |
| ... | ... | ... | ... | ... |
| 99995 | False | False | False | False |
| 99996 | False | False | False | False |
| 99997 | False | False | False | False |
| 99998 | False | False | False | False |
| 99999 | False | False | False | False |
99946 rows × 4 columns
Checking the date and time¶
In [26]:
df1['Pickup_date'][0]
Out[26]:
'2015-05-02 21:43:00'
Also checking the type of the date and time is it a str or int or it should be in datetime format¶
In [28]:
type(df1['Pickup_date'][0])
Out[28]:
str
This is the way to convert a pandas to datetime pd.to_datetime(df['Pickup_date'])¶
In [30]:
pd.to_datetime(df1['Pickup_date'])
Out[30]:
0 2015-05-02 21:43:00
1 2015-01-20 19:52:59
2 2015-03-19 20:26:00
3 2015-04-10 17:38:00
4 2015-03-23 07:03:00
...
99995 2015-04-13 16:12:00
99996 2015-03-06 21:32:00
99997 2015-03-19 19:56:00
99998 2015-05-02 16:02:00
99999 2015-06-24 16:04:00
Name: Pickup_date, Length: 99946, dtype: datetime64[ns]
In [31]:
df1['Pickup_date'] = pd.to_datetime(df1['Pickup_date'])
In [32]:
df1['Pickup_date'].dtype
Out[32]:
dtype('<M8[ns]')
In [33]:
df1['Pickup_date'][0] #So now its got converted to a timestamp for the normal time
Out[33]:
Timestamp('2015-05-02 21:43:00')
In [34]:
type(df1['Pickup_date'][0]) # now the pandas is converted it to the timestamp
Out[34]:
pandas._libs.tslibs.timestamps.Timestamp
In [35]:
df1.dtypes #here now if you see the pickupdate is converted from an object to timestamp
Out[35]:
Dispatching_base_num object Pickup_date datetime64[ns] Affiliated_base_num object locationID int64 dtype: object
Categrorical data can be either object or bool it can be taken as groups/ categroies. numerical data is either integer which is discrete, float which is a continuous¶
problem statement which month is the highest pickup of uber in NEW YORK¶
In [38]:
df1
Out[38]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | |
|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 |
| ... | ... | ... | ... | ... |
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 |
99946 rows × 4 columns
dt.month is used to return months in the form of numbers¶
In [40]:
df1['Pickup_date'].dt.month
Out[40]:
0 5
1 1
2 3
3 4
4 3
..
99995 4
99996 3
99997 3
99998 5
99999 6
Name: Pickup_date, Length: 99946, dtype: int32
In [41]:
df1['month'] = df1['Pickup_date'].dt.month_name()# it prints all the months name
In [42]:
df1['month']
Out[42]:
0 May
1 January
2 March
3 April
4 March
...
99995 April
99996 March
99997 March
99998 May
99999 June
Name: month, Length: 99946, dtype: object
In [43]:
df1['month'].value_counts()
Out[43]:
month June 19620 May 18660 April 15982 March 15969 February 15896 January 13819 Name: count, dtype: int64
In [44]:
df1['month'].value_counts().plot(kind = 'bar')
Out[44]:
<Axes: xlabel='month'>
for different days to execute¶
In [46]:
df1['Pickup_date'].dt.day_name()
Out[46]:
0 Saturday
1 Tuesday
2 Thursday
3 Friday
4 Monday
...
99995 Monday
99996 Friday
99997 Thursday
99998 Saturday
99999 Wednesday
Name: Pickup_date, Length: 99946, dtype: object
for hourly¶
In [48]:
df1['Pickup_date'].dt.hour
Out[48]:
0 21
1 19
2 20
3 17
4 7
..
99995 16
99996 21
99997 19
99998 16
99999 16
Name: Pickup_date, Length: 99946, dtype: int32
for every minute¶
In [50]:
df1['Pickup_date'].dt.minute
Out[50]:
0 43
1 52
2 26
3 38
4 3
..
99995 12
99996 32
99997 56
99998 2
99999 4
Name: Pickup_date, Length: 99946, dtype: int32
printing the first 5 values¶
In [52]:
df1.head(5)
Out[52]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | |
|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 | May |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 | January |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 | March |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 | April |
| 4 | B02764 | 2015-03-23 07:03:00 | B00111 | 140 | March |
printing the last five values¶
In [54]:
df1.tail(5)
Out[54]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | |
|---|---|---|---|---|---|
| 99995 | B02764 | 2015-04-13 16:12:00 | B02764 | 234 | April |
| 99996 | B02764 | 2015-03-06 21:32:00 | B02764 | 24 | March |
| 99997 | B02598 | 2015-03-19 19:56:00 | B02598 | 17 | March |
| 99998 | B02682 | 2015-05-02 16:02:00 | B02682 | 68 | May |
| 99999 | B02764 | 2015-06-24 16:04:00 | B02764 | 125 | June |
In [55]:
df1['Day_of_Week'] = df1['Pickup_date'].dt.day_name()
to do cross tabulation¶
In [57]:
df1['weekday'] = df1['Pickup_date'].dt.day_name()
pivot = pd.crosstab(index=df1['month'], columns=df1['weekday'])
In [58]:
pivot
Out[58]:
| weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| month | |||||||
| April | 2365 | 1833 | 2508 | 2052 | 2823 | 1880 | 2521 |
| February | 2655 | 1970 | 2550 | 2183 | 2396 | 2129 | 2013 |
| January | 2508 | 1353 | 2745 | 1651 | 2378 | 1444 | 1740 |
| June | 2793 | 2848 | 3037 | 2485 | 2767 | 3187 | 2503 |
| March | 2465 | 2115 | 2522 | 2379 | 2093 | 2388 | 2007 |
| May | 3262 | 1865 | 3519 | 2944 | 2627 | 2115 | 2328 |
In [59]:
pivot.plot(kind='bar', figsize=(8,6))
Out[59]:
<Axes: xlabel='month'>
Analying the rush hour in new york [Data Analysis]¶
In [61]:
pivot.plot(kind='line', figsize=(8,6))
Out[61]:
<Axes: xlabel='month'>
In [62]:
df1['hour'] = df1['Pickup_date'].dt.hour
In [63]:
summary = df1.groupby(['weekday', 'hour'], as_index=False).size()
In [64]:
summary
Out[64]:
| weekday | hour | size | |
|---|---|---|---|
| 0 | Friday | 0 | 581 |
| 1 | Friday | 1 | 333 |
| 2 | Friday | 2 | 197 |
| 3 | Friday | 3 | 138 |
| 4 | Friday | 4 | 161 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 1044 |
| 164 | Wednesday | 20 | 897 |
| 165 | Wednesday | 21 | 949 |
| 166 | Wednesday | 22 | 900 |
| 167 | Wednesday | 23 | 669 |
168 rows × 3 columns
In [65]:
plt.figure(figsize=(8, 6))
sns.pointplot(x="hour", y="size", hue="weekday", data=summary)
plt.show()
Which base number has the most amount of active vechiles??¶
In [67]:
df1.columns
Out[67]:
Index(['Dispatching_base_num', 'Pickup_date', 'Affiliated_base_num',
'locationID', 'month', 'Day_of_Week', 'weekday', 'hour'],
dtype='object')
In [68]:
os.listdir(r"N:\datasets\Datasets") #this list calls all the files from the os
Out[68]:
['.ipynb_checkpoints', 'other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv', 'Untitled1.ipynb']
In [69]:
df_foil = pd.read_csv(r'N:\datasets\Datasets\Uber-Jan-Feb-FOIL.csv')
In [70]:
df_foil
Out[70]:
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
| ... | ... | ... | ... | ... |
| 349 | B02764 | 2/28/2015 | 3952 | 39812 |
| 350 | B02617 | 2/28/2015 | 1372 | 14022 |
| 351 | B02682 | 2/28/2015 | 1386 | 14472 |
| 352 | B02512 | 2/28/2015 | 230 | 1803 |
| 353 | B02765 | 2/28/2015 | 747 | 7753 |
354 rows × 4 columns
In [71]:
df_foil.head()
Out[71]:
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
| 3 | B02682 | 1/1/2015 | 945 | 7679 |
| 4 | B02617 | 1/1/2015 | 1228 | 9537 |
In [72]:
init_notebook_mode(connected=True)
In [73]:
df_foil.columns
Out[73]:
Index(['dispatching_base_number', 'date', 'active_vehicles', 'trips'], dtype='object')
BOXPLOT USING PLOTLY¶
In [75]:
px.box(x='dispatching_base_number', y='active_vehicles', data_frame=df_foil)
Violin plot -> which means distance + box¶
In [77]:
px.violin(x='dispatching_base_number', y='active_vehicles', data_frame=df_foil)
In [78]:
os.listdir(r"N:\datasets\Datasets") #this list calls all the files from the os
Out[78]:
['.ipynb_checkpoints', 'other-American_B01362.csv', 'other-Carmel_B00256.csv', 'other-Dial7_B00887.csv', 'other-Diplo_B01196.csv', 'other-Federal_02216.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Highclass_B01717.csv', 'other-Lyft_B02510.csv', 'other-Prestige_B01338.csv', 'other-Skyline_B00111.csv', 'Uber-Jan-Feb-FOIL.csv', 'uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv', 'Untitled1.ipynb']
Now lets combine data into one big data or some new data file¶
In [80]:
files = os.listdir(r"N:\datasets\Datasets")[-9:-1]
files
Out[80]:
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
In [81]:
files.remove('uber-raw-data-janjune-15.csv')
files
Out[81]:
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
In [82]:
files.remove('uber-raw-data-janjune-15_sample.csv')
files
Out[82]:
['uber-raw-data-apr14.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'uber-raw-data-may14.csv', 'uber-raw-data-sep14.csv']
Concating all these files together¶
In [84]:
final_df = pd.DataFrame()
path = r"N:\datasets\Datasets"
for file in files:
current_df = pd.read_csv(path+'/'+file)
final_df = pd.concat([current_df, final_df])
In [85]:
final_df.shape
Out[85]:
(4534327, 4)
In [86]:
final_df.duplicated().sum()
Out[86]:
82581
In [87]:
final_df.drop_duplicates(inplace = True)
In [88]:
final_df.shape
Out[88]:
(4451746, 4)
In [89]:
final_df
Out[89]:
| Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|
| 0 | 9/1/2014 0:01:00 | 40.2201 | -74.0021 | B02512 |
| 1 | 9/1/2014 0:01:00 | 40.7500 | -74.0027 | B02512 |
| 2 | 9/1/2014 0:03:00 | 40.7559 | -73.9864 | B02512 |
| 3 | 9/1/2014 0:06:00 | 40.7450 | -73.9889 | B02512 |
| 4 | 9/1/2014 0:11:00 | 40.8145 | -73.9444 | B02512 |
| ... | ... | ... | ... | ... |
| 564511 | 4/30/2014 23:22:00 | 40.7640 | -73.9744 | B02764 |
| 564512 | 4/30/2014 23:26:00 | 40.7629 | -73.9672 | B02764 |
| 564513 | 4/30/2014 23:31:00 | 40.7443 | -73.9889 | B02764 |
| 564514 | 4/30/2014 23:32:00 | 40.6756 | -73.9405 | B02764 |
| 564515 | 4/30/2014 23:48:00 | 40.6880 | -73.9608 | B02764 |
4451746 rows × 4 columns
At what locations of new york city we are getting rush¶
In [91]:
rush_uber = final_df.groupby(['Lat','Lon'], as_index=False).size()
Folium is a map¶
In [93]:
basemap = folium.Map()
basemap
Out[93]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [94]:
from folium.plugins import HeatMap
In [95]:
HeatMap(rush_uber).add_to(basemap)
Out[95]:
<folium.plugins.heat_map.HeatMap at 0x2810b5952d0>
In [96]:
basemap
Out[96]:
Make this Notebook Trusted to load map: File -> Trust Notebook